In [48]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
%matplotlib notebook
In [2]:
data = pd.read_csv("data/une_rt_m.tsv", sep="\t")
codes = pd.read_csv("data/wikipedia-iso-country-codes.csv")
In [3]:
data.head()
Out[3]:
s_adj,age,unit,sex,geo\time 2020M06 2020M05 2020M04 2020M03 2020M02 2020M01 2019M12 2019M11 2019M10 ... 1983M10 1983M09 1983M08 1983M07 1983M06 1983M05 1983M04 1983M03 1983M02 1983M01
0 NSA,TOTAL,PC_ACT,F,AT 6.3 5.6 5.2 4.7 4.1 4.4 4.0 3.9 4.2 ... : : : : : : : : : :
1 NSA,TOTAL,PC_ACT,F,BE 5.0 5.0 5.2 5.0 4.9 4.8 4.6 4.5 4.7 ... : : : : : : : : : :
2 NSA,TOTAL,PC_ACT,F,BG 3.9 4.2 4.7 4.1 4.2 4.3 3.9 3.8 3.6 ... : : : : : : : : : :
3 NSA,TOTAL,PC_ACT,F,CH : : : 4.8 4.5 4.4 4.1 4.0 4.2 ... : : : : : : : : : :
4 NSA,TOTAL,PC_ACT,F,CY 9.2 8.3 7.9 7.5 7.4 7.6 7.4 7.7 5.8 ... : : : : : : : : : :

5 rows × 451 columns

In [4]:
col = data["s_adj,age,unit,sex,geo\\time"]
In [5]:
data.drop("s_adj,age,unit,sex,geo\\time", axis=1, inplace=True)
In [6]:
data.columns = pd.to_datetime(list(data.columns), format='%YM%m ').date
In [7]:
data.insert(0, "s_adj,age,unit,sex,geo\\time", col)
data.head()
Out[7]:
s_adj,age,unit,sex,geo\time 2020-06-01 2020-05-01 2020-04-01 2020-03-01 2020-02-01 2020-01-01 2019-12-01 2019-11-01 2019-10-01 ... 1983-10-01 1983-09-01 1983-08-01 1983-07-01 1983-06-01 1983-05-01 1983-04-01 1983-03-01 1983-02-01 1983-01-01
0 NSA,TOTAL,PC_ACT,F,AT 6.3 5.6 5.2 4.7 4.1 4.4 4.0 3.9 4.2 ... : : : : : : : : : :
1 NSA,TOTAL,PC_ACT,F,BE 5.0 5.0 5.2 5.0 4.9 4.8 4.6 4.5 4.7 ... : : : : : : : : : :
2 NSA,TOTAL,PC_ACT,F,BG 3.9 4.2 4.7 4.1 4.2 4.3 3.9 3.8 3.6 ... : : : : : : : : : :
3 NSA,TOTAL,PC_ACT,F,CH : : : 4.8 4.5 4.4 4.1 4.0 4.2 ... : : : : : : : : : :
4 NSA,TOTAL,PC_ACT,F,CY 9.2 8.3 7.9 7.5 7.4 7.6 7.4 7.7 5.8 ... : : : : : : : : : :

5 rows × 451 columns

In [8]:
codes.head()
Out[8]:
English short name lower case Alpha-2 code Alpha-3 code Numeric code ISO 3166-2
0 Afghanistan AF AFG 4 ISO 3166-2:AF
1 Ã…land Islands AX ALA 248 ISO 3166-2:AX
2 Albania AL ALB 8 ISO 3166-2:AL
3 Algeria DZ DZA 12 ISO 3166-2:DZ
4 American Samoa AS ASM 16 ISO 3166-2:AS
In [9]:
codes.columns = ['country', 'C2', 'C3', 'numeric', 'iso']
codes.columns
Out[9]:
Index(['country', 'C2', 'C3', 'numeric', 'iso'], dtype='object')
In [10]:
data['C2'] = data['s_adj,age,unit,sex,geo\\time'].apply(lambda x: x.split(",")[-1])
data['age'] = data['s_adj,age,unit,sex,geo\\time'].apply(lambda x: x.split(",")[1])
data['unit'] = data['s_adj,age,unit,sex,geo\\time'].apply(lambda x: x.split(",")[2])
data['sex'] = data['s_adj,age,unit,sex,geo\\time'].apply(lambda x: x.split(",")[3])
data['s_adj'] = data['s_adj,age,unit,sex,geo\\time'].apply(lambda x: x.split(",")[0])
data.head()
Out[10]:
s_adj,age,unit,sex,geo\time 2020-06-01 2020-05-01 2020-04-01 2020-03-01 2020-02-01 2020-01-01 2019-12-01 2019-11-01 2019-10-01 ... 1983-05-01 1983-04-01 1983-03-01 1983-02-01 1983-01-01 C2 age unit sex s_adj
0 NSA,TOTAL,PC_ACT,F,AT 6.3 5.6 5.2 4.7 4.1 4.4 4.0 3.9 4.2 ... : : : : : AT TOTAL PC_ACT F NSA
1 NSA,TOTAL,PC_ACT,F,BE 5.0 5.0 5.2 5.0 4.9 4.8 4.6 4.5 4.7 ... : : : : : BE TOTAL PC_ACT F NSA
2 NSA,TOTAL,PC_ACT,F,BG 3.9 4.2 4.7 4.1 4.2 4.3 3.9 3.8 3.6 ... : : : : : BG TOTAL PC_ACT F NSA
3 NSA,TOTAL,PC_ACT,F,CH : : : 4.8 4.5 4.4 4.1 4.0 4.2 ... : : : : : CH TOTAL PC_ACT F NSA
4 NSA,TOTAL,PC_ACT,F,CY 9.2 8.3 7.9 7.5 7.4 7.6 7.4 7.7 5.8 ... : : : : : CY TOTAL PC_ACT F NSA

5 rows × 456 columns

In [11]:
data['C2'].unique()
Out[11]:
array(['AT', 'BE', 'BG', 'CH', 'CY', 'CZ', 'DE', 'DK', 'EA', 'EA18',
       'EA19', 'EE', 'EL', 'ES', 'EU25', 'EU27_2007', 'EU27_2020', 'EU28',
       'FI', 'FR', 'HR', 'HU', 'IE', 'IS', 'IT', 'JP', 'LT', 'LU', 'LV',
       'MT', 'NL', 'NO', 'PL', 'PT', 'RO', 'SE', 'SI', 'SK', 'UK', 'US',
       'TR'], dtype=object)
In [12]:
data['age'].unique()
Out[12]:
array(['TOTAL', 'Y25-74', 'Y_LT25'], dtype=object)
In [13]:
data['unit'].unique()
# PC_ACT = Percentage of active population
# THS_PER = Thousand persons
Out[13]:
array(['PC_ACT', 'THS_PER'], dtype=object)
In [14]:
data['sex'].unique()
Out[14]:
array(['F', 'M', 'T'], dtype=object)
In [15]:
data['s_adj'].unique() # Seasonal adjustment
# NSA = Unadjusted data
# SA = Seasonally adjusted data
# TC = Trend cycle data
Out[15]:
array(['NSA', 'SA', 'TC'], dtype=object)
In [16]:
data.drop("s_adj,age,unit,sex,geo\\time", axis=1, inplace=True)
In [17]:
cols = ['C2','age','unit','sex', 's_adj']
In [18]:
data_t = data.drop(cols, axis=1)
data_t.head()
Out[18]:
2020-06-01 2020-05-01 2020-04-01 2020-03-01 2020-02-01 2020-01-01 2019-12-01 2019-11-01 2019-10-01 2019-09-01 ... 1983-10-01 1983-09-01 1983-08-01 1983-07-01 1983-06-01 1983-05-01 1983-04-01 1983-03-01 1983-02-01 1983-01-01
0 6.3 5.6 5.2 4.7 4.1 4.4 4.0 3.9 4.2 3.8 ... : : : : : : : : : :
1 5.0 5.0 5.2 5.0 4.9 4.8 4.6 4.5 4.7 5.0 ... : : : : : : : : : :
2 3.9 4.2 4.7 4.1 4.2 4.3 3.9 3.8 3.6 3.4 ... : : : : : : : : : :
3 : : : 4.8 4.5 4.4 4.1 4.0 4.2 4.9 ... : : : : : : : : : :
4 9.2 8.3 7.9 7.5 7.4 7.6 7.4 7.7 5.8 6.9 ... : : : : : : : : : :

5 rows × 450 columns

In [19]:
columns = data_t.columns[data_t.columns > pd.Timestamp('2010-06')]
In [20]:
cols
Out[20]:
['C2', 'age', 'unit', 'sex', 's_adj']
In [21]:
data_sel = data[columns]
for col in cols:
    data_sel.insert(0, col, data[col])
data_sel.head()
Out[21]:
s_adj sex unit age C2 2020-06-01 2020-05-01 2020-04-01 2020-03-01 2020-02-01 ... 2011-04-01 2011-03-01 2011-02-01 2011-01-01 2010-12-01 2010-11-01 2010-10-01 2010-09-01 2010-08-01 2010-07-01
0 NSA F PC_ACT TOTAL AT 6.3 5.6 5.2 4.7 4.1 ... 4.1 4.7 5.6 4.4 4.3 4.3 5.1 4.7 4.6 4.7
1 NSA F PC_ACT TOTAL BE 5.0 5.0 5.2 5.0 4.9 ... 6.6 6.6 6.8 7.2 7.7 8.0 8.4 8.8 9.2 9.0
2 NSA F PC_ACT TOTAL BG 3.9 4.2 4.7 4.1 4.2 ... 10.6 10.7 11.0 11.1 10.9 d 10.6 9.9 9.2 8.9 8.9
3 NSA F PC_ACT TOTAL CH : : : 4.8 4.5 ... 4.1 4.8 5.1 5.0 4.6 4.5 4.8 5.5 5.8 5.6
4 NSA F PC_ACT TOTAL CY 9.2 8.3 7.9 7.5 7.4 ... 6.9 7.4 7.9 7.5 6.3 6.0 4.9 5.0 5.9 6.6

5 rows × 125 columns

In [22]:
data_sel = data_sel.merge(codes, on="C2")
data_sel.head()
Out[22]:
s_adj sex unit age C2 2020-06-01 2020-05-01 2020-04-01 2020-03-01 2020-02-01 ... 2010-12-01 2010-11-01 2010-10-01 2010-09-01 2010-08-01 2010-07-01 country C3 numeric iso
0 NSA F PC_ACT TOTAL AT 6.3 5.6 5.2 4.7 4.1 ... 4.3 4.3 5.1 4.7 4.6 4.7 Austria AUT 40 ISO 3166-2:AT
1 NSA M PC_ACT TOTAL AT 6.1 6.3 5.2 5.0 5.5 ... 4.4 4.4 4.0 4.8 4.7 5.0 Austria AUT 40 ISO 3166-2:AT
2 NSA T PC_ACT TOTAL AT 6.2 6.0 5.2 4.9 4.9 ... 4.4 4.4 4.5 4.8 4.7 4.8 Austria AUT 40 ISO 3166-2:AT
3 NSA F THS_PER TOTAL AT 136 116 107 97 88 ... 83 85 101 93 91 92 Austria AUT 40 ISO 3166-2:AT
4 NSA M THS_PER TOTAL AT 146 149 121 118 133 ... 99 99 91 111 109 113 Austria AUT 40 ISO 3166-2:AT

5 rows × 129 columns

In [23]:
data_sel.shape
Out[23]:
(1645, 129)
In [24]:
data_tr = data_sel.melt(id_vars=["country", "age", "unit", "sex", "s_adj", 
                                 "C2", "C3", "numeric", "iso"], 
                        var_name="Date", value_name="Value")
data_tr.head()
Out[24]:
country age unit sex s_adj C2 C3 numeric iso Date Value
0 Austria TOTAL PC_ACT F NSA AT AUT 40 ISO 3166-2:AT 2020-06-01 6.3
1 Austria TOTAL PC_ACT M NSA AT AUT 40 ISO 3166-2:AT 2020-06-01 6.1
2 Austria TOTAL PC_ACT T NSA AT AUT 40 ISO 3166-2:AT 2020-06-01 6.2
3 Austria TOTAL THS_PER F NSA AT AUT 40 ISO 3166-2:AT 2020-06-01 136
4 Austria TOTAL THS_PER M NSA AT AUT 40 ISO 3166-2:AT 2020-06-01 146
In [25]:
data_tr.shape
Out[25]:
(197400, 11)
In [26]:
import re
data_tr['Value'] = data_tr['Value'].apply(lambda x: re.sub(r"[a-zA-Z: ]", "", x))
data_tr['Value'] = data_tr['Value'].apply(lambda x: x.replace(" ",""))

data_tr = data_tr.loc[~(data_tr.Value=="")]

data_tr['Value'] = data_tr['Value'].apply(lambda x: float(x))
In [27]:
len(data_tr['Value'].unique())
Out[27]:
5185

Estadísticas generales del desempleo en la unión Europea

In [28]:
data_per = data_tr[(data_tr["unit"]=='PC_ACT')&
                   (data_tr["s_adj"]=="TC")]
In [29]:
data_th = data_tr[(data_tr["unit"]=='THS_PER')&
                  (data_tr["s_adj"]=="TC")]
In [30]:
tot_per = pd.DataFrame(data_per.groupby(["age", "sex"])["Value"].mean()).reset_index()
In [31]:
tot_th = pd.DataFrame(data_th.groupby(["age", "sex"])["Value"].mean()).reset_index()
In [81]:
fig = px.bar(y=tot_per["Value"], x=tot_per["age"], color=tot_per["sex"], 
             barmode='group')
fig.update_layout(title="Porcentaje de desempleo en la Unión Europea",
                  xaxis_title="Grupo de edad",
                  yaxis_title="Porcentaje de desempleo",
                  legend_title="Sexo",
                  xaxis = dict(
                    tickvals = ['TOTAL', 'Y25-74', 'Y_LT25'],
                    ticktext = ['Total', 'Mayores de 25', 'Menores de 25']
                ))

fig.show()
In [66]:
fig = px.bar(y=tot_th["Value"], x=tot_th["age"], color=tot_th["sex"], 
             barmode='group')
fig.update_layout(title="Desempleo total en la Unión Europea",
                  xaxis_title="Grupo de edad",
                  yaxis_title="Miles de personas",
                  legend_title="Sexo",
                  xaxis = dict(
                    tickvals = ['TOTAL', 'Y25-74', 'Y_LT25'],
                    ticktext = ['Total', 'Mayores de 25', 'Menores de 25']
                ))

fig.show()
In [67]:
most_un = data_per.groupby("country")["Value"].mean().nlargest(10)
df_most = pd.DataFrame(most_un).reset_index()
df_most
Out[67]:
country Value
0 Spain 27.730185
1 Croatia 19.411204
2 Portugal 16.805556
3 Cyprus 15.891944
4 Slovakia 15.331944
5 Latvia 13.633519
6 France 13.529537
7 Ireland 13.205556
8 Turkey 12.830791
9 Lithuania 12.817593
In [80]:
fig = px.bar(y=df_most["country"], x=df_most["Value"], 
             color=df_most["country"], text=round(df_most["Value"], 2))
fig.update_layout(title="Países de la Unión Europea con un mayor desempleo",
                  xaxis_title="Porcentaje de desempleo",
                  yaxis_title="País")
fig.show()
In [72]:
less_un = data_per.groupby("country")["Value"].mean().nsmallest(10)[::-1]
df_less = pd.DataFrame(less_un).reset_index()
df_less
Out[72]:
country Value
0 Luxembourg 9.303704
1 Denmark 8.200000
2 Czech Republic 7.310556
3 Malta 6.900556
4 Netherlands 6.755370
5 Austria 6.506667
6 Iceland 5.888333
7 Switzerland 5.685755
8 Norway 5.521289
9 Germany 5.341019
In [79]:
fig = px.bar(y=df_less["country"], x=df_less["Value"], 
             color=df_less["country"], text=round(df_less["Value"], 2))
fig.update_layout(title="Países de la Unión Europea con un menor desempleo",
                  xaxis_title="Porcentaje de desempleo",
                  yaxis_title="País")
fig.show()
In [41]:
total_y25_74 = data_tr.loc[(data_tr.age=='Y25-74')&
                           (data_tr.unit=='PC_ACT')&
                           (data_tr.sex=='T')&
                           (data_tr.s_adj=='TC')]
In [42]:
def plot_time_variation(df, y='Value', size=1, is_log=False, title=""):
    f, ax = plt.subplots(1,1, figsize=(4*size,3*size))

    countries = list(df.country.unique())
    for country in countries:
        df_ = df[(df['country']==country)] 
        g = sns.lineplot(x="Date", y=y, data=df_,  label=country)  
        ax.text(max(df_['Date']), (df_.loc[df_['Date']==max(df_['Date']), y]), str(country))
    plt.xticks(rotation=90)
    plt.title(f'Desempleo total, {title}, agrupado por país')
    ax.text(max(df_['Date']), (df_.loc[df_['Date']==max(df_['Date']), y]), str(country))
    plt.legend(loc="upper left", bbox_to_anchor=(1,1))
    if(is_log):
        ax.set(yscale="log")
    ax.grid(color='black', linestyle='dotted', linewidth=0.75)
    ax.set_xlabel("Fecha")
    ax.set_ylabel("Valor")
    plt.show()
In [43]:
plot_time_variation(total_y25_74, size=4, 
                    is_log=True, title = "edad 24-75 -")
In [44]:
total_F_y25_74 = data_tr.loc[(data_tr.age=='Y25-74')&
                             (data_tr.unit=='PC_ACT')&
                             (data_tr.sex=='F')&
                             (data_tr.s_adj=='TC')]
plot_time_variation(total_F_y25_74, size=4, is_log=True, 
                    title = "mujeres, edad 24-75 ")
In [45]:
total_M_y25_74 = data_tr.loc[(data_tr.age=='Y25-74')&
                             (data_tr.unit=='PC_ACT')&
                             (data_tr.sex=='M')&
                             (data_tr.s_adj=='TC')]

plot_time_variation(total_M_y25_74, size=4, is_log=True, 
                    title = "hombres, edad 24-75 ")
In [46]:
total_M_y25 = data_tr.loc[(data_tr.age=='Y_LT25')&
                          (data_tr.unit=='PC_ACT')&
                          (data_tr.sex=='M')&
                          (data_tr.s_adj=='TC')]

plot_time_variation(total_M_y25, size=4, is_log=True, 
                    title = "hombres, menores de 25 años ")
In [47]:
total_F_y25 = data_tr.loc[(data_tr.age=='Y_LT25')&
                          (data_tr.unit=='PC_ACT')&
                          (data_tr.sex=='F')&
                          (data_tr.s_adj=='TC')]

plot_time_variation(total_F_y25, size=4, is_log=True, 
                    title = "mujeres, menores de 25 años")
In [65]:
data_tr = data_tr[data_tr["s_adj"]=='TC']
data_tr.head()
Out[65]:
country age unit sex s_adj C2 C3 numeric iso Date Value
36 Austria TOTAL PC_ACT F TC AT AUT 40 ISO 3166-2:AT 2020-06-01 5.0
37 Austria TOTAL PC_ACT M TC AT AUT 40 ISO 3166-2:AT 2020-06-01 6.4
38 Austria TOTAL PC_ACT T TC AT AUT 40 ISO 3166-2:AT 2020-06-01 5.7
39 Austria TOTAL THS_PER F TC AT AUT 40 ISO 3166-2:AT 2020-06-01 104.0
40 Austria TOTAL THS_PER M TC AT AUT 40 ISO 3166-2:AT 2020-06-01 151.0
In [70]:
data_yr = data_per.groupby([data_tr['Date'].map(lambda x: x.year), "country", "age", "sex"]).mean()
dt_yr = data_yr.drop("numeric", axis=1).reset_index()
dt_yr
Out[70]:
Date country age sex Value
0 2010 Austria TOTAL F 4.633333
1 2010 Austria TOTAL M 4.750000
2 2010 Austria TOTAL T 4.683333
3 2010 Austria Y25-74 F 3.816667
4 2010 Austria Y25-74 M 4.116667
... ... ... ... ... ...
2799 2020 Switzerland Y_LT25 M 8.733333
2800 2020 Switzerland Y_LT25 T 8.000000
2801 2020 Turkey TOTAL T 12.925000
2802 2020 Turkey Y25-74 T 10.900000
2803 2020 Turkey Y_LT25 T 24.375000

2804 rows × 5 columns

Los 3 Países con mayor desempleo en la unión europea

In [113]:
def plot_time_variation_age_sex(data_tr_df, y='Value', country="Netherlands"):
    c_df = data_tr_df.loc[(data_tr_df.country==country)&
                          (data_tr_df.unit=='PC_ACT')&
                          (data_tr_df.s_adj=='TC')]
    
    f, ax = plt.subplots(1,1, figsize=(16,12))
    sns.lineplot(x="Date", y=y, data=c_df.loc[(c_df.age=='Y_LT25')&
                                              (c_df.sex=='F')],  
                 label="Mujeres, menores de 25 años")  
    
    sns.lineplot(x="Date", y=y, data=c_df.loc[(c_df.age=='Y_LT25')&
                                              (c_df.sex=='M')],  
                 label="Hombres, menores de 25 años")  
    
    sns.lineplot(x="Date", y=y, data=c_df.loc[(c_df.age=='Y25-74')&
                                              (c_df.sex=='F')],  
                 label="Mujeres, mayores de 25 años")  
    
    sns.lineplot(x="Date", y=y, data=c_df.loc[(c_df.age=='Y25-74')&
                                              (c_df.sex=='M')],  
                 label="Hombres, mayores de 25 años")  

    plt.xticks(rotation=90)
    plt.title(f'Desempleo total en {country}, agrupados por edad y sexo')
    plt.legend(loc="upper left", bbox_to_anchor=(1,1))
    ax.grid(color='black', linestyle='dotted', linewidth=0.75)
    ax.set_xlabel("Fecha")
    ax.set_ylabel("Porcentaje")
    plt.show()  
In [83]:
s_yr = dt_yr.groupby(["Date","sex","age"])["Value"].mean()
df_yr = pd.DataFrame(s_yr).reset_index()
In [112]:
plt.figure(figsize=(12,9))
ax = sns.lineplot(x="Date", y="Value", hue="age", style="sex", data=df_yr)
ax.set_title("Evolución del desempleo en la Unión Europea")
ax.set_xlabel("Fecha")
ax.set_ylabel("Porcentaje");
In [101]:
data_1920 = data_tr[data_tr["Date"]>pd.to_datetime("2019-02")]

España

In [114]:
plot_time_variation_age_sex(data_tr, country="Spain")
In [115]:
plot_time_variation_age_sex(data_1920, country="Spain")

Croacia

In [116]:
plot_time_variation_age_sex(data_tr, country="Croatia")
In [117]:
plot_time_variation_age_sex(data_1920, country="Croatia")

Portugal

In [118]:
plot_time_variation_age_sex(data_tr,country="Portugal")
In [119]:
plot_time_variation_age_sex(data_1920, country="Portugal")

Los 3 Países con menor desempleo en la unión europea

Suiza

In [120]:
plot_time_variation_age_sex(data_tr,country="Switzerland")
In [121]:
plot_time_variation_age_sex(data_1920, country="Switzerland")

Noruega

In [122]:
plot_time_variation_age_sex(data_tr,country="Norway")
In [123]:
plot_time_variation_age_sex(data_1920,country="Norway")

Alemania

In [124]:
plot_time_variation_age_sex(data_tr, country="Germany")
In [126]:
plot_time_variation_age_sex(data_1920, country="Germany")